*===================================================================*
*   BIHAR EVALUATION OF SOCIAL FRANCHISING AND TELEMEDICINE (BEST)
*                      STANDARDIZED PATIENTS
* Creates stata files (dta) for Diarrhea and Pneumonia using the raw
* txt files
* Some minor cleaning - FIRST ROUND DATA
*====================================================================*

/* NOTES
(1) This do file creates the dta files for Diarrhea and Pneumonia using the raw txt files
(2) Stata will automatically recognize Row 1 as variable names. But, it will think that row 2 is data
(3) Read txt file READ_ME to understand how txt files were created. */

***********************************************
* Open SP Diarrhea (txt file)
***********************************************
* Row 1 of the txt file supplies the variable names; row 2 is read in as the
* first observation and is used below as the variable labels.
* The path uses a forward slash (accepted by Stata on every platform) and
* braces so the global macro name is delimited explicitly.
  insheet using "${rawdatap1}/Sp_diarrhea_Bihar_coded.txt", names clear

* Labelling variables using 2nd row of the txt file (observation 1).
* Compound double quotes keep the label intact when the text itself
* contains a double quote.
  foreach var of varlist * {
    local varlab = `var'[1]
    label variable `var' `"`varlab'"'
    }

* Drop the label row and check that 193 distinct observations remain.
* NOTE(review): r(unique_value) counts distinct rows, so exact duplicate rows
* would not make this assertion fail - confirm that is the intended check.
  drop in 1
  duplicates report
  assert r(unique_value) == 193

* Sample
* sample2 is 2 when s1 is empty, when s2 equals 2 or is missing, or when both
* c1 and c2 are missing; it is 1 otherwise.
* (Original author's note on the c1/c2 condition: correct statement but all missing.)
  destring s2 c1 c2, replace
  generate sample2 = cond(s1=="" | s2==2 | s2==. | (c1==. & c2==.), 2, 1)
  label variable sample2 "Sample"
  order sample2, first
  tabulate sample2

* Homogenize variables names
* The two lists are paired position by position: the n-th raw name is renamed
* to the n-th harmonized name, in list order.
  local oldnames aux qq7 qq1 qq1a qq2 qq2a qq3 qq3a qq4 qq4a aux1 aux2 aux3
  local newnames form_num prov_id district_name district block_name block village_name village cluster_name cluster qq6_1 qq6_2 qq6_3
  foreach old of local oldnames {
    gettoken new newnames : newnames
    rename `old' `new'
    }

* Strip the unit text from the ml/gm fields so they can be stored as numbers.
  destring t1_2mlgm, replace ignore("mg")
  destring t2_1mlgm t2_2mlgm, replace ignore(" mg")

* Village codes: capital letter O is replaced by the digit zero before
* converting to numeric.
  replace village = subinstr(village, "O", "0", .)
  destring village, replace

* Checking prov_id (cluster)
/*note: according to prov_id in provider's interview and vignettes
  block 93 only has cluster 398
  change id 16106 by 16102
  change cluster 298 by 398
  change cluster 19 by 150*/
* prov_id (3 to 5 characters) is split into a leading part (everything but the
* last two characters) and a two-character trailing part. The leading part is
* compared with cluster, corrected, and the id is then rebuilt.
* Ids of any other length get an empty leading and trailing part.
* Only the pieces of prov_id are edited here; cluster itself is left unchanged.
  tempvar idlen prefix mismatch suffix
  generate `idlen'    = length(prov_id)
  generate `prefix'   = substr(prov_id, 1, `idlen' - 2) if inrange(`idlen', 3, 5)
  generate `mismatch' = 1 if cluster != `prefix'
  generate `suffix'   = substr(prov_id, `idlen' - 1, 2) if inrange(`idlen', 3, 5)

* Corrections (applied in this order)
  replace `prefix' = "95"  if `prefix' == "095" & `mismatch' == 1
  replace `prefix' = "150" if `prefix' == "19"  & `mismatch' == 1
  replace `prefix' = "398" if `prefix' == "298"
  replace `suffix' = "02"  if `suffix' == "06" & `prefix' == "161"

* Rebuild the provider id from the corrected pieces
  drop prov_id
  generate prov_id = `prefix' + `suffix'
  label variable prov_id "Provider id"
  order prov_id, after(cluster)
  drop `idlen' `prefix' `mismatch' `suffix'

* Identify disease
  gen     case = 1
  lab var case "Type of disease"
  lab def case 1 "Diarrhea" 2 "Pneumonia", replace
  lab val case case

* Destring/tostring
* All variables are converted to numeric where possible; t2_5med and q3mh4
* are then forced back to string.
  destring, replace
  tostring t2_5med q3mh4, replace
* tostring writes numeric missing as ".". Blank only that exact value, so
* periods inside real entries (decimals, abbreviations) are preserved.
  replace t2_5med = "" if t2_5med == "."
* NOTE(review): q3mh4 keeps "." for missing values, as in the original code -
* confirm whether it should be blanked as well.

* Save file
* compress is run before saving, as is done for the pneumonia file.
  compress
  save "${prodata1}/standardized_patient_diarrhea", replace






***********************************************
* Open SP Pneumonia (txt file)
***********************************************
* Row 1 of the txt file supplies the variable names; row 2 is read in as the
* first observation and is used below as the variable labels.
* The path uses a forward slash (accepted by Stata on every platform) and
* braces so the global macro name is delimited explicitly.
  insheet using "${rawdatap1}/Sp_pneumonia_Bihar_coded.txt", names clear

* Labelling variables using 2nd row of the txt file (observation 1).
* Compound double quotes keep the label intact when the text itself
* contains a double quote.
  foreach var of varlist * {
    local varlab = `var'[1]
    label variable `var' `"`varlab'"'
    }

* Drop the label row and check that 190 distinct observations remain.
* NOTE(review): r(unique_value) counts distinct rows, so exact duplicate rows
* would not make this assertion fail - confirm that is the intended check.
  drop in 1
  duplicates report
  assert r(unique_value) == 190

* Sample
* sample3 is 2 when s1 is empty, when s2 equals 2 or is missing, or when both
* c1 and c2 are missing; it is 1 otherwise.
* (Original author's note on the c1/c2 condition: correct statement but all missing.)
  destring s2 c1 c2, replace
  generate sample3 = cond(s1=="" | s2==2 | s2==. | (c1==. & c2==.), 2, 1)
  label variable sample3 "Sample"
  order sample3, first
  tabulate sample3

* Homogenize variables names
* The value "00:00" in qq7 is blanked before qq7 becomes prov_id.
  replace qq7 = "" if qq7 == "00:00"

* The two lists are paired position by position and processed in list order,
* so t4 is moved to t5 before t3 is moved to t4.
  local oldnames a qq7 qq1 qq1a qq2 qq2a qq3 qq3a qq4 qq4a t4 t3
  local newnames form_num prov_id district_name district block_name block village_name village cluster_name cluster t5 t4
  foreach old of local oldnames {
    gettoken new newnames : newnames
    rename `old' `new'
    }

* After the shift above, t3 is recreated as an all-missing numeric variable.
  generate t3 = .

* The value "05-Jan" in t1_1dose is blanked.
  replace t1_1dose = "" if t1_1dose == "05-Jan"

* Strip the unit text from the ml/gm fields so they can be stored as numbers.
  destring t1_3mlgm t2_4mlgm, replace ignore(" mg")

* Village codes: capital letter O is replaced by the digit zero before
* converting to numeric.
  replace village = subinstr(village, "O", "0", .)
  destring village, replace

* Checking prov_id (cluster)
/* note: block 93 only has cluster 398*/
* prov_id (3 to 5 characters) is split into a leading part (everything but the
* last two characters) and a two-character trailing part. The leading part is
* compared with cluster, corrected, and the id is then rebuilt.
* Ids of any other length (including empty ids) get an empty leading and
* trailing part.
  tempvar idlen prefix mismatch suffix
  generate `idlen'    = length(prov_id)
  generate `prefix'   = substr(prov_id, 1, `idlen' - 2) if inrange(`idlen', 3, 5)
  generate `mismatch' = 1 if cluster != `prefix'
  generate `suffix'   = substr(prov_id, `idlen' - 1, 2) if inrange(`idlen', 3, 5)

* Corrections (applied in this order; the mismatch flag is computed before
* cluster is edited and is not refreshed afterwards)
  replace cluster  = "151" if inlist(cluster, "1323", "1329") & `mismatch' == 1
  replace `prefix' = "150" if `prefix' == "19"  & `mismatch' == 1
  replace `prefix' = "348" if `prefix' == ""    & `mismatch' == 1
  replace `suffix' = "04"  if `prefix' == "348" & `mismatch' == 1
  replace `prefix' = "398" if `prefix' == "298"

* Rebuild the provider id from the corrected pieces
  drop prov_id
  generate prov_id = `prefix' + `suffix'
  label variable prov_id "Provider id"
  order prov_id, after(cluster)
  drop `idlen' `prefix' `mismatch' `suffix'

/*
* Identify Medical History Questions for Pneumonia
  renpfix h p_h
*/

* Identify disease
* The value label is defined here as well: the data were re-imported with
* "clear", so this section does not rely on a definition made for an earlier
* dataset and the saved file carries its own label definition.
  gen     case = 2
  lab var case "Type of disease"
  lab def case 1 "Diarrhea" 2 "Pneumonia", replace
  lab val case case

* Destring/tostring
* All variables are converted to numeric where possible; t1_5med, t2_5med and
* t2_3dose are then forced back to string.
  destring, replace
  tostring t1_5med t2_5med t2_3dose, replace
* tostring writes numeric missing as ".". Blank only that exact value, so
* periods inside real entries (decimals, abbreviations) are preserved.
  replace t1_5med = "" if t1_5med == "."
  replace t2_5med = "" if t2_5med == "."
* NOTE(review): t2_3dose keeps "." for missing values, as in the original
* code - confirm whether it should be blanked as well.

* Save file
  compress
  save "${prodata1}/standardized_patient_pneumonia", replace

